In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# Load the raw dataset into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs where the file exists at exactly this location.
dataset_path = r"D:\Downloads\dataset.csv"
df = pd.read_csv(dataset_path)
In [3]:
# Preview the first rows to sanity-check the column layout and value formats.
df.head()
Out[3]:
| VIN (1-10) | County | City | State | Postal Code | Model Year | Make | Model | Electric Vehicle Type | Clean Alternative Fuel Vehicle (CAFV) Eligibility | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | Vehicle Location | Electric Utility | 2020 Census Tract | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | JTMEB3FV6N | Monroe | Key West | FL | 33040 | 2022 | TOYOTA | RAV4 PRIME | Plug-in Hybrid Electric Vehicle (PHEV) | Clean Alternative Fuel Vehicle Eligible | 42 | 0 | NaN | 198968248 | POINT (-81.80023 24.5545) | NaN | 12087972100 |
| 1 | 1G1RD6E45D | Clark | Laughlin | NV | 89029 | 2013 | CHEVROLET | VOLT | Plug-in Hybrid Electric Vehicle (PHEV) | Clean Alternative Fuel Vehicle Eligible | 38 | 0 | NaN | 5204412 | POINT (-114.57245 35.16815) | NaN | 32003005702 |
| 2 | JN1AZ0CP8B | Yakima | Yakima | WA | 98901 | 2011 | NISSAN | LEAF | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 73 | 0 | 15.0 | 218972519 | POINT (-120.50721 46.60448) | PACIFICORP | 53077001602 |
| 3 | 1G1FW6S08H | Skagit | Concrete | WA | 98237 | 2017 | CHEVROLET | BOLT EV | Battery Electric Vehicle (BEV) | Clean Alternative Fuel Vehicle Eligible | 238 | 0 | 39.0 | 186750406 | POINT (-121.7515 48.53892) | PUGET SOUND ENERGY INC | 53057951101 |
| 4 | 3FA6P0SU1K | Snohomish | Everett | WA | 98201 | 2019 | FORD | FUSION | Plug-in Hybrid Electric Vehicle (PHEV) | Not eligible due to low battery range | 26 | 0 | 38.0 | 2006714 | POINT (-122.20596 47.97659) | PUGET SOUND ENERGY INC | 53061041500 |
In [4]:
# Per-column dtype and non-null count; columns whose non-null count is below the
# row count contain missing values that are imputed further down.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 112634 entries, 0 to 112633 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 VIN (1-10) 112634 non-null object 1 County 112634 non-null object 2 City 112634 non-null object 3 State 112634 non-null object 4 Postal Code 112634 non-null int64 5 Model Year 112634 non-null int64 6 Make 112634 non-null object 7 Model 112614 non-null object 8 Electric Vehicle Type 112634 non-null object 9 Clean Alternative Fuel Vehicle (CAFV) Eligibility 112634 non-null object 10 Electric Range 112634 non-null int64 11 Base MSRP 112634 non-null int64 12 Legislative District 112348 non-null float64 13 DOL Vehicle ID 112634 non-null int64 14 Vehicle Location 112610 non-null object 15 Electric Utility 112191 non-null object 16 2020 Census Tract 112634 non-null int64 dtypes: float64(1), int64(6), object(10) memory usage: 14.6+ MB
In [5]:
# Number of missing values per column (before imputation).
df.isna().sum()
Out[5]:
VIN (1-10) 0 County 0 City 0 State 0 Postal Code 0 Model Year 0 Make 0 Model 20 Electric Vehicle Type 0 Clean Alternative Fuel Vehicle (CAFV) Eligibility 0 Electric Range 0 Base MSRP 0 Legislative District 286 DOL Vehicle ID 0 Vehicle Location 24 Electric Utility 443 2020 Census Tract 0 dtype: int64
In [6]:
# Summary statistics for the numeric columns.
# NOTE(review): several numeric columns (Postal Code, Legislative District,
# DOL Vehicle ID, 2020 Census Tract) look like identifiers, so their mean/std
# are not meaningful quantities.
df.describe()
Out[6]:
| Postal Code | Model Year | Electric Range | Base MSRP | Legislative District | DOL Vehicle ID | 2020 Census Tract | |
|---|---|---|---|---|---|---|---|
| count | 112634.000000 | 112634.000000 | 112634.000000 | 112634.000000 | 112348.000000 | 1.126340e+05 | 1.126340e+05 |
| mean | 98156.226850 | 2019.003365 | 87.812987 | 1793.439681 | 29.805604 | 1.994567e+08 | 5.296650e+10 |
| std | 2648.733064 | 2.892364 | 102.334216 | 10783.753486 | 14.700545 | 9.398427e+07 | 1.699104e+09 |
| min | 1730.000000 | 1997.000000 | 0.000000 | 0.000000 | 1.000000 | 4.777000e+03 | 1.101001e+09 |
| 25% | 98052.000000 | 2017.000000 | 0.000000 | 0.000000 | 18.000000 | 1.484142e+08 | 5.303301e+10 |
| 50% | 98119.000000 | 2020.000000 | 32.000000 | 0.000000 | 34.000000 | 1.923896e+08 | 5.303303e+10 |
| 75% | 98370.000000 | 2022.000000 | 208.000000 | 0.000000 | 43.000000 | 2.191899e+08 | 5.305307e+10 |
| max | 99701.000000 | 2023.000000 | 337.000000 | 845000.000000 | 49.000000 | 4.792548e+08 | 5.603300e+10 |
In [7]:
# (rows, columns) of the freshly loaded frame.
df.shape
Out[7]:
(112634, 17)
EDA: Exploratory Data Analysis¶
In [9]:
# Count rows that are exact duplicates (across all columns) of an earlier row.
df.duplicated().sum()
Out[9]:
0
In [10]:
# Fill missing model names with the most frequent model in the dataset.
most_common_model = df["Model"].mode().iloc[0]
df["Model"] = df["Model"].fillna(most_common_model)
In [11]:
# "Legislative District" is a discrete district identifier stored as a float,
# not a measurement. Filling gaps with the mean would produce a fractional value
# that is not a real district, so use the most frequent district instead, the
# same strategy the other categorical columns use.
# NOTE(review): in the preview the rows with a missing district are out-of-state
# vehicles; consider leaving them missing or using a dedicated sentinel if
# district-level results matter.
most_common_district = df["Legislative District"].mode().iloc[0]
df["Legislative District"] = df["Legislative District"].fillna(most_common_district)
In [12]:
# Fill missing vehicle locations with the most frequent location value.
most_common_location = df["Vehicle Location"].mode().iloc[0]
df["Vehicle Location"] = df["Vehicle Location"].fillna(most_common_location)
In [13]:
# Fill missing electric utility names with the most frequent utility.
most_common_utility = df["Electric Utility"].mode().iloc[0]
df["Electric Utility"] = df["Electric Utility"].fillna(most_common_utility)
In [14]:
# Re-check missing values per column after imputation; every count should be 0.
df.isna().sum()
Out[14]:
VIN (1-10) 0 County 0 City 0 State 0 Postal Code 0 Model Year 0 Make 0 Model 0 Electric Vehicle Type 0 Clean Alternative Fuel Vehicle (CAFV) Eligibility 0 Electric Range 0 Base MSRP 0 Legislative District 0 DOL Vehicle ID 0 Vehicle Location 0 Electric Utility 0 2020 Census Tract 0 dtype: int64
In [15]:
# Persist the imputed dataset next to the notebook.
# - The ".csv" extension lets tools recognise the file type.
# - index=False keeps the auto-generated RangeIndex out of the file, so reloading
#   it does not produce an extra "Unnamed: 0" column.
df.to_csv("Analysis on Electric Vehicles.csv", index=False)
In [16]:
# Shape after imputation; filling values must not change the row or column count.
df.shape
Out[16]:
(112634, 17)
In [17]:
# Dtypes and non-null counts after imputation; every column should now be fully populated.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 112634 entries, 0 to 112633 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 VIN (1-10) 112634 non-null object 1 County 112634 non-null object 2 City 112634 non-null object 3 State 112634 non-null object 4 Postal Code 112634 non-null int64 5 Model Year 112634 non-null int64 6 Make 112634 non-null object 7 Model 112634 non-null object 8 Electric Vehicle Type 112634 non-null object 9 Clean Alternative Fuel Vehicle (CAFV) Eligibility 112634 non-null object 10 Electric Range 112634 non-null int64 11 Base MSRP 112634 non-null int64 12 Legislative District 112634 non-null float64 13 DOL Vehicle ID 112634 non-null int64 14 Vehicle Location 112634 non-null object 15 Electric Utility 112634 non-null object 16 2020 Census Tract 112634 non-null int64 dtypes: float64(1), int64(6), object(10) memory usage: 14.6+ MB
In [18]:
# Exact column names (several contain spaces and parentheses) for use in the plots below.
df.columns
Out[18]:
Index(['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year',
'Make', 'Model', 'Electric Vehicle Type',
'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range',
'Base MSRP', 'Legislative District', 'DOL Vehicle ID',
'Vehicle Location', 'Electric Utility', '2020 Census Tract'],
dtype='object')
Univariate Analysis¶
Import required library - plotly.express¶
In [19]:
import plotly.express as px
In [20]:
# Box plot summarising the spread of the "Electric Range" column.
fig = px.box(
    data_frame=df,
    y="Electric Range",
    title="Box Plot of Electric Range",
    labels={"Electric Range": "Electric Range"},
    color_discrete_sequence=["#FF5733"],  # single custom (orange) colour
)
# Fixed canvas size and an explicit y-axis title.
fig.update_layout(
    yaxis_title="Electric Range",
    width=800,
    height=600,
)
fig.show()
In [21]:
# Distribution of "Base MSRP" across 30 bins.
# NOTE(review): the summary statistics above show a median of 0 for this column,
# so zeros probably stand for "not reported" and will dominate the first bin.
fig2 = px.histogram(
    data_frame=df,
    x="Base MSRP",
    nbins=30,
    title="Histogram of Base MSRP",
    labels={"Base MSRP": "Base MSRP"},
    color_discrete_sequence=["#EF553B"],  # single custom colour
)
# Explicit axis titles and a fixed canvas size.
fig2.update_layout(
    xaxis_title="Base MSRP",
    yaxis_title="Count",
    width=800,
    height=600,
)
fig2.show()
In [ ]:
Histograms for numerical features¶
In [22]:
# Histogram of "Electric Range" (30 bins) with a kernel-density overlay.
range_hist_ax = sns.histplot(df["Electric Range"], bins=30, kde=True)
range_hist_ax.set_title("Histogram of Electric Range")
plt.show()
In [23]:
# Number of registered vehicles per model year, drawn with seaborn's "dark" palette.
year_ax = sns.countplot(data=df, x="Model Year", palette="dark")
year_ax.set_title("Count of Vehicles by Model Year")
year_ax.set_xlabel("Model Year")
year_ax.set_ylabel("Count")
# Tilt the year labels so neighbouring ticks do not overlap.
plt.xticks(rotation=45)
plt.show()
In [24]:
# Horizontal bar chart of vehicles per make, most common make first.
makes_by_frequency = df["Make"].value_counts().index
make_ax = sns.countplot(data=df, y="Make", order=makes_by_frequency, palette="dark")
make_ax.set_title("Count of Vehicles by Make")
make_ax.set_ylabel("Make")
make_ax.set_xlabel("Count")
plt.show()
In [ ]:
Frequency distribution for categorical features¶
Bivariate Analysis¶
Task: This is an open-ended problem. Apply exploratory data analysis (univariate and bivariate) to the dataset loaded above.¶
In [25]:
# Electric range of every vehicle, grouped along the x-axis by make.
px.scatter(data_frame=df, x="Make", y="Electric Range")
Box plot using plotly.express¶
In [26]:
# Compare the electric-range distribution between the electric vehicle types.
px.box(data_frame=df, x="Electric Vehicle Type", y="Electric Range")
Pie chart using plotly.express¶
In [28]:
# Share of registered vehicles per make.
# "2020 Census Tract" is a geographic identifier, so summing it as the sector
# value produces meaningless sizes. With `values` omitted, each sector is sized
# by the number of rows for that make.
px.pie(df, names="Make", title="Share of Vehicles by Make")
In [29]:
# Distinct state codes present in the data, in order of first appearance.
df["State"].unique()
Out[29]:
array(['FL', 'NV', 'WA', 'IL', 'NY', 'VA', 'OK', 'KS', 'CA', 'NE', 'MD',
'CO', 'DC', 'TN', 'SC', 'CT', 'OR', 'TX', 'SD', 'HI', 'GA', 'MS',
'AR', 'NC', 'MO', 'UT', 'PA', 'DE', 'OH', 'WY', 'AL', 'ID', 'AZ',
'AK', 'LA', 'NM', 'WI', 'KY', 'NJ', 'MN', 'MA', 'ME', 'RI', 'NH',
'ND'], dtype=object)
In [30]:
# Mean electric range per state, as a one-column DataFrame indexed by state.
# NOTE(review): `grouped_df` is not referenced by any later cell visible here.
grouped_df = df.groupby("State")[["Electric Range"]].mean()
In [31]:
# Confirm the frame still has its full set of rows and columns.
df.shape
Out[31]:
(112634, 17)
In [32]:
# Use a dark colour palette for this and subsequent seaborn plots.
sns.set_palette("dark")

# The ten cities with the most registered vehicles, most frequent first.
top_cities = df['City'].value_counts().nlargest(10).index

# Pass `order=top_cities` so the bars are ranked by count; without it the cities
# are drawn in order of first appearance and the "Top 10" ranking is not visible.
sns.countplot(y='City', data=df[df['City'].isin(top_cities)], order=top_cities)
plt.title('Top 10 Cities for Electric Vehicles')
plt.xlabel('Count')
plt.ylabel('City')
plt.show()
Heatmap of Correlation (for numeric variables)¶
In [33]:
import numpy as np

# Pairwise correlation of the numeric columns only. Passing numeric_only=True
# silences the FutureWarning this cell produced and keeps it working on pandas
# versions where the text columns would otherwise make DataFrame.corr raise.
# NOTE(review): some numeric columns are identifiers (Postal Code, DOL Vehicle
# ID, 2020 Census Tract), so correlations involving them carry little meaning.
correlation_matrix = df.corr(numeric_only=True)

# Render the matrix as an annotated heatmap.
fig = px.imshow(correlation_matrix, title="Correlation Heatmap", text_auto=True)
fig.show()
C:\Users\hp\AppData\Local\Temp\ipykernel_17068\2181185178.py:3: FutureWarning: The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.
In [34]:
# Share of each electric vehicle type; sectors are sized by row count.
fig = px.pie(
    data_frame=df,
    names="Electric Vehicle Type",
    title="Distribution of Electric Vehicle Types",
)
fig.show()
In [35]:
import pandas as pd
import plotly.express as px

# Reuse the DataFrame loaded (and imputed) earlier in the notebook; re-reading
# the CSV here would silently discard the imputation for every later cell.

# Print the column names to verify
print(df.columns)

# ZIP-code (ZCTA) boundaries for Washington. The state-outline GeoJSON used
# previously has no "properties.ZCTA5CE10" field, so no postal code could ever
# match a polygon and the map stayed empty.
# NOTE(review): confirm this URL is reachable from the environment that renders
# the figure; the file is fetched when the map is drawn.
wa_zip_geojson_url = (
    "https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/"
    "master/wa_washington_zip_codes_geo.min.json"
)

# Count the vehicles per Washington postal code and model year.
# Sorting by model year first makes the animation frames chronological, because
# frames are created in order of first appearance in the data.
state_nyc = (
    df.loc[df['State'] == 'WA']
    .groupby(['Postal Code', 'Model Year'])
    .size()
    .reset_index(name='Number_of_Vehicles')
    .sort_values(['Model Year', 'Postal Code'])
)

# ZCTA ids in the GeoJSON are 5-character strings, while "Postal Code" is an
# integer column; convert so that `locations` can match `featureidkey`.
state_nyc['Postal Code'] = state_nyc['Postal Code'].astype(str).str.zfill(5)

# Create a choropleth mapbox
fig = px.choropleth_mapbox(
    state_nyc,
    geojson=wa_zip_geojson_url,
    locations='Postal Code',
    color='Number_of_Vehicles',
    featureidkey="properties.ZCTA5CE10",  # must match the GeoJSON property holding the ZIP code
    # Same colour scale in every frame so years are comparable.
    range_color=(0, state_nyc['Number_of_Vehicles'].max()),
    mapbox_style="carto-positron",
    zoom=5,
    center={"lat": 47.7511, "lon": -120.7401},
    title="Number of EV Vehicles based on location in Washington Over Time",
    animation_frame='Model Year',
    hover_data=['Number_of_Vehicles']
)

# Update layout for aesthetics
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()  # Show the animated map
Index(['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year',
'Make', 'Model', 'Electric Vehicle Type',
'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range',
'Base MSRP', 'Legislative District', 'DOL Vehicle ID',
'Vehicle Location', 'Electric Utility', '2020 Census Tract'],
dtype='object')
In [36]:
pip install bar_chart_race
Requirement already satisfied: bar_chart_race in c:\users\hp\anaconda3\lib\site-packages (0.1.0) Requirement already satisfied: pandas>=0.24 in c:\users\hp\anaconda3\lib\site-packages (from bar_chart_race) (1.5.3) Requirement already satisfied: matplotlib>=3.1 in c:\users\hp\anaconda3\lib\site-packages (from bar_chart_race) (3.8.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (1.4.4) Requirement already satisfied: numpy<2,>=1.21 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (1.24.4) Requirement already satisfied: packaging>=20.0 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (23.1) Requirement already satisfied: pillow>=6.2.0 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (10.2.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\hp\anaconda3\lib\site-packages (from pandas>=0.24->bar_chart_race) (2023.3.post1) Requirement already satisfied: six>=1.5 in c:\users\hp\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib>=3.1->bar_chart_race) (1.16.0) Note: you may need to restart the kernel to use updated packages.
In [37]:
import pandas as pd
import plotly.express as px
from bar_chart_race import bar_chart_race  # NOTE(review): not used in this cell

# Reuse the DataFrame loaded (and imputed) earlier in the notebook; re-reading
# the CSV here would silently discard the imputation for every later cell.

# Count the vehicles per make and model year.
# Sort by model year first: animation frames are created in order of first
# appearance, and the groupby result is ordered by make, so without this sort
# the frames are not guaranteed to play in chronological order.
d = (
    df.groupby(['Make', 'Model Year'])
    .size()
    .reset_index(name='Number_of_Vehicles')
    .sort_values(['Model Year', 'Make'])
)

# Derive the x-axis limit from the data (plus headroom for the outside text
# labels) instead of a fixed value, so no bar is clipped in any frame.
x_axis_max = d['Number_of_Vehicles'].max() * 1.1

# Create the animated horizontal bar plot
fig = px.bar(d,
             x='Number_of_Vehicles',        # count of EV vehicles on the x-axis
             y='Make',                      # make on the y-axis
             color='Make',                  # colour each make differently
             animation_frame='Model Year',  # one frame per model year
             orientation='h',               # horizontal bar chart
             title='EV Makes and Their Count Over the Years',
             labels={'Number_of_Vehicles': 'Number of EV Vehicles'},
             range_x=[0, x_axis_max])       # fixed range shared by all frames

# Update traces for aesthetics
fig.update_traces(texttemplate='%{x}', textposition='outside')  # show the bar value next to each bar
fig.update_layout(yaxis=dict(showgrid=True, gridcolor='LightGray'),  # grid for better readability
                  yaxis_title='EV Makes',
                  xaxis_title='Number of EV Vehicles',
                  title_x=0.5,               # centre the title
                  title_font=dict(size=20),  # larger title font
                  width=800, height=600)     # fixed canvas size

# Show the plot
fig.show()